From c4c23a635b034a247192bd6237e23de189c262a3 Mon Sep 17 00:00:00 2001 From: tsteven4 Date: Tue, 17 Sep 2013 01:12:47 +0000 Subject: [PATCH] change illegal xml character stripping from QRegExp/QRegularExpression to QTextCodec for efficiency. Thanks Conrad. git-svn-id: http://gpsbabel.googlecode.com/svn/trunk@4616 f51c46e8-681c-474f-0cfe-069cfd0219fb --- gpsbabel/gpx.cc | 1 - gpsbabel/kml.cc | 2 - gpsbabel/src/core/xmlstreamwriter.cc | 82 +++++++++++++++++----------- gpsbabel/src/core/xmlstreamwriter.h | 42 +++++++------- 4 files changed, 73 insertions(+), 54 deletions(-) diff --git a/gpsbabel/gpx.cc b/gpsbabel/gpx.cc index 35fbfa90b..b451f1786 100644 --- a/gpsbabel/gpx.cc +++ b/gpsbabel/gpx.cc @@ -1257,7 +1257,6 @@ gpx_wr_init(const char* fname) writer = new gpsbabel::XmlStreamWriter(oqfile); writer->setAutoFormattingIndent(2); - writer->setCodec("UTF-8"); writer->writeStartDocument(); } diff --git a/gpsbabel/kml.cc b/gpsbabel/kml.cc index e01a53403..caa51b9dc 100644 --- a/gpsbabel/kml.cc +++ b/gpsbabel/kml.cc @@ -427,7 +427,6 @@ kml_wr_init(const char* fname) writer = new gpsbabel::XmlStreamWriter(oqfile); writer->setAutoFormattingIndent(2); - writer->setCodec("UTF-8"); } /* @@ -446,7 +445,6 @@ kml_wr_position_init(const char* fname) * matters in this mode, turn the pretty formatting off. 
*/ writer->setAutoFormatting(false); - writer->setCodec("UTF-8"); max_position_points = atoi(opt_max_position_points); } diff --git a/gpsbabel/src/core/xmlstreamwriter.cc b/gpsbabel/src/core/xmlstreamwriter.cc index abfb230d7..d47e341cd 100644 --- a/gpsbabel/src/core/xmlstreamwriter.cc +++ b/gpsbabel/src/core/xmlstreamwriter.cc @@ -18,14 +18,9 @@ */ #include -#include #include -#if (QT_VERSION < QT_VERSION_CHECK(5, 0, 0)) -#include -#else -#include -#endif +#include #include // As this code began in C, we have several hundred places that write @@ -37,50 +32,75 @@ namespace gpsbabel { -XmlStreamWriter:: XmlStreamWriter(QString* s) : QXmlStreamWriter(s) {} +XmlTextCodec* XmlTextCodec::instance = new XmlTextCodec(); -XmlStreamWriter::XmlStreamWriter(QFile* f) : QXmlStreamWriter(f) {} - -#if (QT_VERSION < QT_VERSION_CHECK(5, 0, 0)) -QRegExp XmlStreamWriter::badXml10 = QRegExp("[\\x0000-\\x0008]|[\\x000b-\\x000c]|[\\x000e-\\x001f]"); -#else -QRegularExpression XmlStreamWriter::badXml10 = QRegularExpression("[\\x00-\\x08]|[\\x0b-\\x0c]|[\\x0e-\\x1f]"); -#endif +XmlTextCodec::XmlTextCodec() : QTextCodec() +{ + utf8Codec = QTextCodec::codecForName("UTF-8"); +} -// Dont emit the attribute if there's nothing interesting in it. -void XmlStreamWriter::writeOptionalAttribute(const QString& qualifiedName, QString value) +QByteArray XmlTextCodec::convertFromUnicode(const QChar* chars, int len, QTextCodec::ConverterState* state) const { - if (!value.isEmpty()) { - QXmlStreamWriter::writeAttribute(qualifiedName, value.replace(badXml10, " ")); +// Qt 4.7.4, 4.6.2 don't have IgnoreHeader set on the first call, which can +// result in a BOM being output by utf8Codec. 
+ state->flags |= QTextCodec::IgnoreHeader; + QByteArray r = utf8Codec->fromUnicode(chars, len, state); + char* data = r.data(); + for (int i = 0; i < r.size(); i++) { + if ((0x00 <= data[i] && data[i] <= 0x08) || + (0x0b <= data[i] && data[i] <= 0x0c) || + (0x0e <= data[i] && data[i] <= 0x1f)) { + data[i] = ' '; + } } + return r; } -// Dont emit the element if there's nothing interesting in it. -void XmlStreamWriter::writeOptionalTextElement(const QString& qualifiedName, QString text) +QString XmlTextCodec::convertToUnicode(const char* chars, int len, QTextCodec::ConverterState* state) const { - if (!text.isEmpty()) { - QXmlStreamWriter::writeTextElement(qualifiedName, text.replace(badXml10, " ")); - } + return utf8Codec->toUnicode(chars, len, state); +} + +int XmlTextCodec::mibEnum() const +{ + return UTF8_FOR_XML_MIB; } -void XmlStreamWriter::writeAttribute(const QString& qualifiedName, QString value) +// Our name must not overlap with UTF-8 or it may be returned by QTextCodec::codecForName("UTF-8") +QByteArray XmlTextCodec::name() const { - QXmlStreamWriter::writeAttribute(qualifiedName, value.replace(badXml10, " ")); + return QByteArray("UTF-8-XML"); } -void XmlStreamWriter::writeCDATA(QString text) +XmlStreamWriter::XmlStreamWriter(QString* string) : QXmlStreamWriter(string) { - QXmlStreamWriter::writeCDATA(text.replace(badXml10, " ")); } -void XmlStreamWriter::writeCharacters(QString text) +XmlStreamWriter::XmlStreamWriter(QFile* f) : QXmlStreamWriter(f) { - QXmlStreamWriter::writeCharacters(text.replace(badXml10, " ")); + setCodec(XmlTextCodec::instance); } -void XmlStreamWriter::writeTextElement(const QString& qualifiedName, QString value) +// We must override the encoding, we don't want to use XmlTextCodec::name(). 
+void XmlStreamWriter::writeStartDocument() { - QXmlStreamWriter::writeTextElement(qualifiedName, value.replace(badXml10, " ")); + writeProcessingInstruction("xml version=\"1.0\" encoding=\"UTF-8\""); +} + +// Dont emit the attribute if there's nothing interesting in it. +void XmlStreamWriter::writeOptionalAttribute(const QString& qualifiedName, const QString& value) +{ + if (!value.isEmpty()) { + QXmlStreamWriter::writeAttribute(qualifiedName, value); + } +} + +// Dont emit the element if there's nothing interesting in it. +void XmlStreamWriter::writeOptionalTextElement(const QString& qualifiedName, const QString& text) +{ + if (!text.isEmpty()) { + QXmlStreamWriter::writeTextElement(qualifiedName, text); + } } } // namespace gpsbabel diff --git a/gpsbabel/src/core/xmlstreamwriter.h b/gpsbabel/src/core/xmlstreamwriter.h index 5a15f3a5e..4dc27fb19 100644 --- a/gpsbabel/src/core/xmlstreamwriter.h +++ b/gpsbabel/src/core/xmlstreamwriter.h @@ -20,39 +20,41 @@ #ifndef XMLSTREAMWRITER_H #define XMLSTREAMWRITER_H -#include +#include #include class QFile; -#if (QT_VERSION < QT_VERSION_CHECK(5, 0, 0)) -class QRegExp; -#else -class QRegularExpression; -#endif namespace gpsbabel { -class XmlStreamWriter : public QXmlStreamWriter +// From the "vendor" range, see: +// https://www.iana.org/assignments/character-sets/character-sets.xhtml +const int UTF8_FOR_XML_MIB = 2000; + +class XmlTextCodec : public QTextCodec { private: -#if (QT_VERSION < QT_VERSION_CHECK(5, 0, 0)) - static QRegExp badXml10; -#else - static QRegularExpression badXml10; -#endif + QTextCodec* utf8Codec; +public: + XmlTextCodec(); + static XmlTextCodec *instance; + virtual QByteArray name() const; + virtual int mibEnum() const; +protected: + virtual QByteArray convertFromUnicode(const QChar* chars, int len, QTextCodec::ConverterState* state) const; + virtual QString convertToUnicode(const char* chars, int len, QTextCodec::ConverterState* state) const; +}; +class XmlStreamWriter : public QXmlStreamWriter +{ 
public: - XmlStreamWriter(QString* s); + XmlStreamWriter(QString* string); XmlStreamWriter(QFile* f); - void writeOptionalAttribute(const QString& qualifiedName, QString value); - void writeOptionalTextElement(const QString& qualifiedName, QString text); - void writeAttribute(const QString& qualifiedName, QString value); - void writeCDATA(QString text); - void writeCharacters(QString text); - void writeTextElement(const QString& qualifiedName, QString value); - + void writeStartDocument(void); + void writeOptionalAttribute(const QString& qualifiedName, const QString& value); + void writeOptionalTextElement(const QString& qualifiedName, const QString& text); }; } // namespace gpsbabel -- 2.30.2